
/***************************************************************************
                                                                           *
Copyright 2013 CertiVox UK Ltd.                                           *
                                                                           *
This file is part of CertiVox MIRACL Crypto SDK.                           *
                                                                           *
The CertiVox MIRACL Crypto SDK provides developers with an                 *
extensive and efficient set of cryptographic functions.                    *
For further information about its features and functionalities please      *
refer to http://www.certivox.com                                           *
                                                                           *
* The CertiVox MIRACL Crypto SDK is free software: you can                 *
  redistribute it and/or modify it under the terms of the                  *
  GNU Affero General Public License as published by the                    *
  Free Software Foundation, either version 3 of the License,               *
  or (at your option) any later version.                                   *
                                                                           *
* The CertiVox MIRACL Crypto SDK is distributed in the hope                *
  that it will be useful, but WITHOUT ANY WARRANTY; without even the       *
  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
  See the GNU Affero General Public License for more details.              *
                                                                           *
* You should have received a copy of the GNU Affero General Public         *
  License along with CertiVox MIRACL Crypto SDK.                           *
  If not, see <http://www.gnu.org/licenses/>.                              *
                                                                           *
You can be released from the requirements of the license by purchasing     *
a commercial license. Buying such a license is mandatory as soon as you    *
develop commercial activities involving the CertiVox MIRACL Crypto SDK     *
without disclosing the source code of your own applications, or shipping   *
the CertiVox MIRACL Crypto SDK with a closed source product.               *
                                                                           *
***************************************************************************/
/*
 *   MIRACL routines for arithmetic over GF(2^m),
 *   mrgf2m.c
 *
 *   For algorithms used, see IEEE P1363 Standard, Appendix A
 *   unless otherwise stated.
 *
 *   The time-critical routines are the multiplication routine multiply2()
 *   and (for AFFINE co-ordinates), the modular inverse routine inverse2()
 *   and the routines it calls.
 *
 *   READ COMMENTS CAREFULLY FOR VARIOUS OPTIMIZATION SUGGESTIONS
 *
 *   No assembly language used.
 *
 *   Use utility irp.cpp to generate optimal code for function reduce2(.) below
 *
 *   Space can be saved by removing unneeded functions and
 *   deleting unrequired functionality.
 *   For example in reduce2(.) remove code for those irreducible polynomials
 *   which will not be used by your code.
 */

#include <stdlib.h>

#include "miracl.h"
#ifdef MR_STATIC
#include <string.h>
#endif

#ifdef MR_COUNT_OPS
extern int fpm2, fpi2;
#endif

/* must use /arch:SSE2 in compilation */

#ifdef _M_IX86_FP
#if _M_IX86_FP >= 2
#define MR_SSE2_INTRINSICS
#endif
#endif

/* must use -msse2 in compilation */

#ifdef __SSE2__
#define MR_SSE2_INTRINSICS
#endif

#ifdef MR_SSE2_INTRINSICS
#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif

#if MIRACL == 64
#define MR_SSE2_64
/* Can use SSE2 registers for 64-bit manipulations */
#endif

#endif

#ifndef MR_NOFULLWIDTH
/* This does not make sense using floating-point! */

/* This is extremely time-critical, and expensive */

/* Some experimental MMX code for x86-32. Seems to be slower than the standard
 * code (on a PIV anyway).. */

#ifdef MR_MMX_x86_32

#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif

static mr_small mr_mul2(mr_small a, mr_small b, mr_small* r)
{
    __m64    rg, tt[4];
    mr_small q;

    tt[0] = _m_from_int(0);
    tt[1] = _m_from_int(a);
    tt[2] = _m_psllqi(tt[1], 1);
    tt[3] = _m_pxor(tt[1], tt[2]);

    rg = tt[b & 3];
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 2) & 3], 2));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 4) & 3], 4));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 6) & 3], 6));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 8) & 3], 8));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 10) & 3], 10));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 12) & 3], 12));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 14) & 3], 14));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 16) & 3], 16));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 18) & 3], 18));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 20) & 3], 20));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 22) & 3], 22));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 24) & 3], 24));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 26) & 3], 26));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 28) & 3], 28));
    rg = _m_pxor(rg, _m_psllqi(tt[(b >> 30)], 30));

    *r = _m_to_int(rg);
    q  = _m_to_int(_m_psrlqi(rg, 32));

    return q;
}

#else

/* This might be faster on a 16-bit processor with no variable shift
instructions. The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15); is just a 1-bit
right shift on the hi|lo value - should be really fast in assembly language

unsigned short mr_mul2(unsigned short x,unsigned short y,unsigned short *r)
{
    unsigned short lo,hi,bit,w;
    hi=0;
    lo=x;
    bit=-(lo&1);
    lo>>=1;

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    hi^=(y&bit);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<15);

    *r=lo;
    return hi;
}

*/

/* This might be faster on an 8-bit processor with no variable shift
instructions. The line w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7); is just a 1-bit right
shift on the hi|lo value - should be really fast in assembly language

unsigned char mr_mul2(unsigned char x,unsigned char y,unsigned char *r)
{
    unsigned char lo,hi,bit,w;
    hi=0;
    lo=x;
    bit=-(lo&1);
    lo>>=1;

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);    bit=-(lo&1);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    hi^=(y&bit);
    w=hi&1; hi>>=1; lo>>=1; lo|=(w<<7);

    *r=lo;
    return hi;
}

*/

/* wouldn't it be nice if instruction sets supported a
   one cycle "carry-free" multiplication instruction ...
   The SmartMips does - its called maddp                 */

#ifndef MR_COMBA2

#if MIRACL == 8

/* maybe use a small precomputed look-up table? */

static mr_small mr_mul2(mr_small a, mr_small b, mr_small* r)
{
    static const mr_small look[256] = {
        0, 0,  0,  0,  0,  0,  0,  0,  0,   0,   0,   0,   0,   0,   0,   0,
        0, 1,  2,  3,  4,  5,  6,  7,  8,   9,   10,  11,  12,  13,  14,  15,
        0, 2,  4,  6,  8,  10, 12, 14, 16,  18,  20,  22,  24,  26,  28,  30,
        0, 3,  6,  5,  12, 15, 10, 9,  24,  27,  30,  29,  20,  23,  18,  17,
        0, 4,  8,  12, 16, 20, 24, 28, 32,  36,  40,  44,  48,  52,  56,  60,
        0, 5,  10, 15, 20, 17, 30, 27, 40,  45,  34,  39,  60,  57,  54,  51,
        0, 6,  12, 10, 24, 30, 20, 18, 48,  54,  60,  58,  40,  46,  36,  34,
        0, 7,  14, 9,  28, 27, 18, 21, 56,  63,  54,  49,  36,  35,  42,  45,
        0, 8,  16, 24, 32, 40, 48, 56, 64,  72,  80,  88,  96,  104, 112, 120,
        0, 9,  18, 27, 36, 45, 54, 63, 72,  65,  90,  83,  108, 101, 126, 119,
        0, 10, 20, 30, 40, 34, 60, 54, 80,  90,  68,  78,  120, 114, 108, 102,
        0, 11, 22, 29, 44, 39, 58, 49, 88,  83,  78,  69,  116, 127, 98,  105,
        0, 12, 24, 20, 48, 60, 40, 36, 96,  108, 120, 116, 80,  92,  72,  68,
        0, 13, 26, 23, 52, 57, 46, 35, 104, 101, 114, 127, 92,  81,  70,  75,
        0, 14, 28, 18, 56, 54, 36, 42, 112, 126, 108, 98,  72,  70,  84,  90,
        0, 15, 30, 17, 60, 51, 34, 45, 120, 119, 102, 105, 68,  75,  90,  85};

    mr_small x1, y0, m, p, q;
    x1 = a & 0xf0;
    y0 = b & 0x0f;
    a <<= 4;
    b >>= 4;

    p = look[(a | y0)];
    q = look[(x1 | b)];

    m = look[a ^ b ^ x1 ^ y0] ^ p ^ q; /* Karatsuba! */

    p ^= (m << 4);
    q ^= (m >> 4);

    *r = p;
    return q;
}

#else

#ifdef MR_SSE2_64

static mr_small mr_mul2(mr_small a, mr_small b, mr_small* r)
{
    int     i, j;
    __m128i pp, tt[16], m;

    m = _mm_set_epi32(0, 0, 0xf0 << 24, 0);

    tt[0] = _mm_setzero_si128();
    tt[1] = _mm_loadl_epi64((__m128i*)&a);
    tt[2] = _mm_xor_si128(
        _mm_slli_epi64(tt[1], 1),
        _mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m, tt[1]), 1), 7));
    tt[4] = _mm_xor_si128(
        _mm_slli_epi64(tt[1], 2),
        _mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m, tt[1]), 1), 6));
    tt[8] = _mm_xor_si128(
        _mm_slli_epi64(tt[1], 3),
        _mm_srli_epi64(_mm_slli_si128(_mm_and_si128(m, tt[1]), 1), 5));
    tt[3]  = _mm_xor_si128(tt[1], tt[2]);
    tt[5]  = _mm_xor_si128(tt[1], tt[4]);
    tt[6]  = _mm_xor_si128(tt[2], tt[4]);
    tt[7]  = _mm_xor_si128(tt[6], tt[1]);
    tt[9]  = _mm_xor_si128(tt[8], tt[1]);
    tt[10] = _mm_xor_si128(tt[8], tt[2]);
    tt[11] = _mm_xor_si128(tt[10], tt[1]);
    tt[12] = _mm_xor_si128(tt[8], tt[4]);
    tt[13] = _mm_xor_si128(tt[12], tt[1]);
    tt[14] = _mm_xor_si128(tt[8], tt[6]);
    tt[15] = _mm_xor_si128(tt[14], tt[1]);

    /* Thanks to Darrel Hankerson, who pointed out an optimization for this code
     * ... */

    i  = (int)(b & 0xF);
    j  = (int)((b >> 4) & 0xF);
    pp = _mm_xor_si128(
        tt[i], _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                            _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60)));
    i = (int)((b >> 8) & 0xF);
    j = (int)((b >> 12) & 0xF);

    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                1));
    i  = (int)((b >> 16) & 0xF);
    j  = (int)((b >> 20) & 0xF);
    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                2));
    i  = (int)((b >> 24) & 0xF);
    j  = (int)((b >> 28) & 0xF);
    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                3));
    i  = (int)((b >> 32) & 0xF);
    j  = (int)((b >> 36) & 0xF);
    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                4));
    i  = (int)((b >> 40) & 0xF);
    j  = (int)((b >> 44) & 0xF);
    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                5));
    i  = (int)((b >> 48) & 0xF);
    j  = (int)((b >> 52) & 0xF);
    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                6));
    i  = (int)((b >> 56) & 0xF);
    j  = (int)(b >> 60);
    pp = _mm_xor_si128(
        pp, _mm_slli_si128(
                _mm_xor_si128(
                    tt[i],
                    _mm_or_si128(_mm_slli_epi64(tt[j], 4),
                                 _mm_srli_epi64(_mm_slli_si128(tt[j], 8), 60))),
                7));

    *r = ((unsigned long long*)&pp)[0];
    return ((unsigned long long*)&pp)[1];
}

#else

static mr_small mr_mul2(mr_small a, mr_small b, mr_small* r)
{
    // int k;
    mr_small kb, t[16];
    mr_small x, q, p;
    mr_utype tb0;
#if MIRACL > 32
    mr_utype tb1, tb2;
#endif

    kb   = b;

#if MIRACL <= 32
    t[0] = 0;             /* small look up table */
    t[3] = t[2] = a << 1; /* it can overflow.... */
    t[1]        = t[2] >> 1;
    t[3] ^= t[1];

    tb0 = (mr_utype)(a & TOPBIT); /* remember top bit    */
    tb0 >>= M1;                   /* all ones if top bit is one */
#else
    t[0] = 0; /* larger look-up table */
    t[8] = a << 3;
    t[4] = t[8] >> 1;
    t[2] = t[4] >> 1;
    t[1] = t[2] >> 1;
    t[3] = t[5] = t[7] = t[9] = t[11] = t[13] = t[15] = t[1];
    t[3] ^= t[2];
    t[5] ^= t[4];
    t[9] ^= t[8];
    t[6] = t[3] << 1;
    t[7] ^= t[6];
    t[10] = t[5] << 1;
    t[11] ^= t[10];
    t[12] = t[6] << 1;
    t[13] ^= t[12];
    t[14] = t[7] << 1;
    t[15] ^= t[14];

    tb0 = (a & TOPBIT); /* remember top bits  */
    tb0 >>= M1;         /* all bits one, if this bit is set in a */
    tb1 = (a & SECBIT) << 1;
    tb1 >>= M1;
    tb2 = (a & THDBIT) << 2;
    tb2 >>= M1;
#endif

#if MIRACL == 8
#define UNWOUNDM
    p = q = t[b & 3];
    q >>= 2;
    x = t[(b >> 2) & 3];
    q ^= x;
    p ^= (x << 2);
    q >>= 2;
    x = t[(b >> 4) & 3];
    q ^= x;
    p ^= (x << 4);
    q >>= 2;
    x = t[(b >> 6)];
    q ^= x;
    p ^= (x << 6);
    q >>= 2;
#endif

#if MIRACL == 16
#define UNWOUNDM
    p = q = t[b & 3];
    q >>= 2;
    x = t[(b >> 2) & 3];
    q ^= x;
    p ^= (x << 2);
    q >>= 2;
    x = t[(b >> 4) & 3];
    q ^= x;
    p ^= (x << 4);
    q >>= 2;
    x = t[(b >> 6) & 3];
    q ^= x;
    p ^= (x << 6);
    q >>= 2;
    x = t[(b >> 8) & 3];
    q ^= x;
    p ^= (x << 8);
    q >>= 2;
    x = t[(b >> 10) & 3];
    q ^= x;
    p ^= (x << 10);
    q >>= 2;
    x = t[(b >> 12) & 3];
    q ^= x;
    p ^= (x << 12);
    q >>= 2;
    x = t[(b >> 14)];
    q ^= x;
    p ^= (x << 14);
    q >>= 2;
#endif

#if MIRACL == 32
#define UNWOUNDM
    p = q = t[b & 3];
    q >>= 2;
    x = t[(b >> 2) & 3];
    q ^= x;
    p ^= (x << 2);
    q >>= 2; /* 8 ASM 80386 instructions */
    x = t[(b >> 4) & 3];
    q ^= x;
    p ^= (x << 4);
    q >>= 2; /* but only 4 ARM instructions! */
    x = t[(b >> 6) & 3];
    q ^= x;
    p ^= (x << 6);
    q >>= 2;
    x = t[(b >> 8) & 3];
    q ^= x;
    p ^= (x << 8);
    q >>= 2;
    x = t[(b >> 10) & 3];
    q ^= x;
    p ^= (x << 10);
    q >>= 2;
    x = t[(b >> 12) & 3];
    q ^= x;
    p ^= (x << 12);
    q >>= 2;
    x = t[(b >> 14) & 3];
    q ^= x;
    p ^= (x << 14);
    q >>= 2;
    x = t[(b >> 16) & 3];
    q ^= x;
    p ^= (x << 16);
    q >>= 2;
    x = t[(b >> 18) & 3];
    q ^= x;
    p ^= (x << 18);
    q >>= 2;
    x = t[(b >> 20) & 3];
    q ^= x;
    p ^= (x << 20);
    q >>= 2;
    x = t[(b >> 22) & 3];
    q ^= x;
    p ^= (x << 22);
    q >>= 2;
    x = t[(b >> 24) & 3];
    q ^= x;
    p ^= (x << 24);
    q >>= 2;
    x = t[(b >> 26) & 3];
    q ^= x;
    p ^= (x << 26);
    q >>= 2;
    x = t[(b >> 28) & 3];
    q ^= x;
    p ^= (x << 28);
    q >>= 2;
    x = t[(b >> 30)];
    q ^= x;
    p ^= (x << 30);
    q >>= 2;
#endif

#if MIRACL == 64
#define UNWOUNDM
    p = q = t[b & 0xf];
    q >>= 4;
    x = t[(b >> 4) & 0xf];
    q ^= x;
    p ^= (x << 4);
    q >>= 4;
    x = t[(b >> 8) & 0xf];
    q ^= x;
    p ^= (x << 8);
    q >>= 4;
    x = t[(b >> 12) & 0xf];
    q ^= x;
    p ^= (x << 12);
    q >>= 4;
    x = t[(b >> 16) & 0xf];
    q ^= x;
    p ^= (x << 16);
    q >>= 4;
    x = t[(b >> 20) & 0xf];
    q ^= x;
    p ^= (x << 20);
    q >>= 4;
    x = t[(b >> 24) & 0xf];
    q ^= x;
    p ^= (x << 24);
    q >>= 4;
    x = t[(b >> 28) & 0xf];
    q ^= x;
    p ^= (x << 28);
    q >>= 4;
    x = t[(b >> 32) & 0xf];
    q ^= x;
    p ^= (x << 32);
    q >>= 4;
    x = t[(b >> 36) & 0xf];
    q ^= x;
    p ^= (x << 36);
    q >>= 4;
    x = t[(b >> 40) & 0xf];
    q ^= x;
    p ^= (x << 40);
    q >>= 4;
    x = t[(b >> 44) & 0xf];
    q ^= x;
    p ^= (x << 44);
    q >>= 4;
    x = t[(b >> 48) & 0xf];
    q ^= x;
    p ^= (x << 48);
    q >>= 4;
    x = t[(b >> 52) & 0xf];
    q ^= x;
    p ^= (x << 52);
    q >>= 4;
    x = t[(b >> 56) & 0xf];
    q ^= x;
    p ^= (x << 56);
    q >>= 4;
    x = t[(b >> 60)];
    q ^= x;
    p ^= (x << 60);
    q >>= 4;

#endif

#ifndef UNWOUNDM

    q = p = (mr_small)0;
    for (k = 0; k < MIRACL; k += 8) {
        q ^= (t[b & 3]);
        b >>= 2;
        p >>= 2;
        p |= q << M2;
        q >>= 2;

        q ^= (t[b & 3]);
        b >>= 2;
        p >>= 2;
        p |= q << M2;
        q >>= 2;

        q ^= (t[b & 3]);
        b >>= 2;
        p >>= 2;
        p |= q << M2;
        q >>= 2;

        q ^= (t[b & 3]);
        b >>= 2;
        p >>= 2;
        p |= q << M2;
        q >>= 2;
    }
#endif

#if MIRACL <= 32
    p ^= (tb0 & (kb << M1)); /* compensate for top bit */
    q ^= (tb0 & (kb >> 1));  /* don't break pipeline.. */
#else
    p ^= (tb0 & (kb << M1));
    q ^= (tb0 & (kb >> 1));
    p ^= (tb1 & (kb << M2));
    q ^= (tb1 & (kb >> 2));
    p ^= (tb2 & (kb << M3));
    q ^= (tb2 & (kb >> 3));
#endif

    *r = p;
    return q;
}

#endif

#endif

#endif

#endif

static int numbits(big x)
{ /* return degree of x */
    mr_small *gx = x->w, bit = TOPBIT;
    int       m, k           = x->len;
    if (k == 0) return 0;
    m = k * MIRACL;
    while (!(gx[k - 1] & bit)) {
        m--;
        bit >>= 1;
    }
    return m;
}

int degree2(big x)
{ /* returns -1 for x=0 */
    return (numbits(x) - 1);
}

/*
static int zerobits(big x)
{
    int m,n,k;
    mr_small *gx,lsb,bit=1;
    k=x->len;
    if (k==0) return (-1);
    gx=x->w;
    for (m=0;m<k;m++)
    {
        if (gx[m]==0) continue;
        n=0;
        lsb=gx[m];
        while (!(bit&lsb))
        {
            n++;
            bit<<=1;
        }
        break;
    }
    return (MIRACL*m+n);
}

static void shiftrightbits(big x,int m)
{
    int i,k=x->len;
    int w=m/MIRACL;
    int b=m%MIRACL;
    mr_small *gx=x->w;
    if (k==0 || m==0) return;
    if (w>0)
    {
        for (i=0;i<k-w;i++)
            gx[i]=gx[i+w];
        for (i=k-w;i<k;i++) gx[i]=0;
        x->len-=w;
    }
    if (b!=0)
    {
        for (i=0;i<k-w-1;i++)
            gx[i]=(gx[i]>>b)|(gx[i+1]<<(MIRACL-b));
        gx[k-w-1]>>=b;
        if (gx[k-w-1]==0) x->len--;
    }
}
*/

static void shiftleftbits(big x, int m)
{
    int       i, k = x->len;
    mr_small  j;
    int       w  = m / MIRACL; /* words */
    int       b  = m % MIRACL; /* bits  */
    mr_small* gx = x->w;
    if (k == 0 || m == 0) return;
    if (w > 0) {
        for (i = k + w - 1; i >= w; i--) gx[i] = gx[i - w];
        for (i = w - 1; i >= 0; i--) gx[i] = 0;
        x->len += w;
    }
    /* time critical */
    if (b != 0) {
        j = gx[k + w - 1] >> (MIRACL - b);
        if (j != 0) {
            x->len++;
            gx[k + w] = j;
        }
        for (i = k + w - 1; i > w; i--) {
            gx[i] = (gx[i] << b) | (gx[i - 1] >> (MIRACL - b));
        }
        gx[w] <<= b;
    }
}

static void square2(big x, big w)
{ /* w=x*x where x can be NULL so be careful */
    int i, n, m;
    // int i,j,n,m;
    mr_small a, *gw;
    // mr_small a,t,r,*gw;

    static const mr_small look[16] = {0,
                                      (mr_small)1 << M8,
                                      (mr_small)4 << M8,
                                      (mr_small)5 << M8,
                                      (mr_small)16 << M8,
                                      (mr_small)17 << M8,
                                      (mr_small)20 << M8,
                                      (mr_small)21 << M8,
                                      (mr_small)64 << M8,
                                      (mr_small)65 << M8,
                                      (mr_small)68 << M8,
                                      (mr_small)69 << M8,
                                      (mr_small)80 << M8,
                                      (mr_small)81 << M8,
                                      (mr_small)84 << M8,
                                      (mr_small)85 << M8};

    if (x != w) copy(x, w);
    n = w->len;
    if (n == 0) return;
    m      = n + n;
    w->len = m;
    gw     = w->w;

    for (i = n - 1; i >= 0; i--) {
        a = gw[i];

#if MIRACL == 8
#define UNWOUNDS
        gw[i + i]     = look[a & 0xF];
        gw[i + i + 1] = look[(a >> 4)];
#endif

#if MIRACL == 16
#define UNWOUNDS
        gw[i + i]     = (look[a & 0xF] >> 8) | look[(a >> 4) & 0xF];
        gw[i + i + 1] = (look[(a >> 8) & 0xF] >> 8) | look[(a >> 12)];
#endif

#if MIRACL == 32
#define UNWOUNDS
        gw[i + i] = (look[a & 0xF] >> 24) | (look[(a >> 4) & 0xF] >> 16) |
                    (look[(a >> 8) & 0xF] >> 8) | look[(a >> 12) & 0xF];
        gw[i + i + 1] = (look[(a >> 16) & 0xF] >> 24) |
                        (look[(a >> 20) & 0xF] >> 16) |
                        (look[(a >> 24) & 0xF] >> 8) | look[(a >> 28)];
#endif

#ifndef UNWOUNDS

        r = 0;
        for (j = 0; j < MIRACL / 2; j += 4) {
            t = look[a & 0xF];
            a >>= 4;
            r >>= 8;
            r |= t;
        }
        gw[i + i] = r;
        r         = 0;

        for (j = 0; j < MIRACL / 2; j += 4) {
            t = look[a & 0xF];
            a >>= 4;
            r >>= 8;
            r |= t;
        }
        gw[i + i + 1] = r;

#endif
    }
    if (gw[m - 1] == 0) {
        w->len--;
        if (gw[m - 2] == 0) mr_lzero(w);
    }
}

/* Use karatsuba to multiply two polynomials with coefficients in GF(2^m) */

#ifndef MR_STATIC

void karmul2_poly(_MIPD_ int n, big* t, big* x, big* y, big* z)
{
    int m, nd2, nd, md, md2;
    if (n == 1) { /* finished */
        modmult2(_MIPP_ * x, *y, *z);
        zero(z[1]);
        return;
    }
    if (n == 2) { /* in-line 2x2 */
        modmult2(_MIPP_ x[0], y[0], z[0]);
        modmult2(_MIPP_ x[1], y[1], z[2]);
        add2(x[0], x[1], t[0]);
        add2(y[0], y[1], t[1]);
        modmult2(_MIPP_ t[0], t[1], z[1]);
        add2(z[1], z[0], z[1]);
        add2(z[1], z[2], z[1]);
        zero(z[3]);
        return;
    }

    if (n == 3) {
        modmult2(_MIPP_ x[0], y[0], z[0]);
        modmult2(_MIPP_ x[1], y[1], z[2]);
        modmult2(_MIPP_ x[2], y[2], z[4]);
        add2(x[0], x[1], t[0]);
        add2(y[0], y[1], t[1]);
        modmult2(_MIPP_ t[0], t[1], z[1]);
        add2(z[1], z[0], z[1]);
        add2(z[1], z[2], z[1]);
        add2(x[1], x[2], t[0]);
        add2(y[1], y[2], t[1]);
        modmult2(_MIPP_ t[0], t[1], z[3]);
        add2(z[3], z[2], z[3]);
        add2(z[3], z[4], z[3]);
        add2(x[0], x[2], t[0]);
        add2(y[0], y[2], t[1]);
        modmult2(_MIPP_ t[0], t[1], t[0]);
        add2(z[2], t[0], z[2]);
        add2(z[2], z[0], z[2]);
        add2(z[2], z[4], z[2]);
        zero(z[5]);
        return;
    }

    if (n % 2 == 0) {
        md = nd = n;
        md2 = nd2 = n / 2;
    } else {
        nd  = n + 1;
        md  = n - 1;
        nd2 = nd / 2;
        md2 = md / 2;
    }

    for (m = 0; m < nd2; m++) {
        copy(x[m], z[m]);
        copy(y[m], z[nd2 + m]);
    }
    for (m = 0; m < md2; m++) {
        add2(z[m], x[nd2 + m], z[m]);
        add2(z[nd2 + m], y[nd2 + m], z[nd2 + m]);
    }

    karmul2_poly(_MIPP_ nd2, &t[nd], z, &z[nd2], t);

    karmul2_poly(_MIPP_ nd2, &t[nd], x, y, z);

    for (m = 0; m < nd; m++) add2(t[m], z[m], t[m]);

    karmul2_poly(_MIPP_ md2, &t[nd], &x[nd2], &y[nd2], &z[nd]);

    for (m = 0; m < md; m++) add2(t[m], z[nd + m], t[m]);
    for (m = 0; m < nd; m++) add2(z[nd2 + m], t[m], z[nd2 + m]);
}

void karmul2_poly_upper(_MIPD_ int n, big* t, big* x, big* y, big* z)
{ /* n is large and even, and upper half of z is known already */
    int m, nd2, nd;
    nd2 = n / 2;
    nd  = n;

    for (m = 0; m < nd2; m++) {
        add2(x[m], x[nd2 + m], z[m]);
        add2(y[m], y[nd2 + m], z[nd2 + m]);
    }

    karmul2_poly(_MIPP_ nd2, &t[nd], z, &z[nd2], t);

    karmul2_poly(_MIPP_ nd2, &t[nd], x, y, z); /* only 2 karmuls needed! */

    for (m = 0; m < nd; m++) add2(t[m], z[m], t[m]);

    for (m = 0; m < nd2; m++) {
        add2(z[nd + m], z[nd + nd2 + m], z[nd + m]);
        add2(z[nd + m], t[nd2 + m], z[nd + m]);
    }

    for (m = 0; m < nd; m++) {
        add2(t[m], z[nd + m], t[m]);
        add2(z[nd2 + m], t[m], z[nd2 + m]);
    }
}

#endif

/* Some in-line karatsuba down at the bottom... */

#ifndef MR_COMBA2

static void mr_bottom2(mr_small* x, mr_small* y, mr_small* z)
{
    mr_small q0, r0, q1, r1, q2, r2;

    q0 = mr_mul2(x[0], y[0], &r0);
    q1 = mr_mul2(x[1], y[1], &r1);
    q2 = mr_mul2((mr_small)(x[0] ^ x[1]), (mr_small)(y[0] ^ y[1]), &r2);

    z[0] = r0;
    z[1] = q0 ^ r1 ^ r0 ^ r2;
    z[2] = q0 ^ r1 ^ q1 ^ q2;
    z[3] = q1;
}

static void mr_bottom3(mr_small* x, mr_small* y, mr_small* z)
{ /* just 6 mr_muls... */
    mr_small q0, r0, q1, r1, q2, r2;
    mr_small a0, b0, a1, b1, a2, b2;

    q0 = mr_mul2(x[0], y[0], &r0);
    q1 = mr_mul2(x[1], y[1], &r1);
    q2 = mr_mul2(x[2], y[2], &r2);

    a0 = mr_mul2((mr_small)(x[0] ^ x[1]), (mr_small)(y[0] ^ y[1]), &b0);
    a1 = mr_mul2((mr_small)(x[1] ^ x[2]), (mr_small)(y[1] ^ y[2]), &b1);
    a2 = mr_mul2((mr_small)(x[0] ^ x[2]), (mr_small)(y[0] ^ y[2]), &b2);

    b0 ^= r0 ^ r1;
    a0 ^= q0 ^ q1;
    b1 ^= r1 ^ r2;
    a1 ^= q1 ^ q2;
    b2 ^= r0 ^ r2;
    a2 ^= q0 ^ q2;

    z[0] = r0;
    z[1] = q0 ^ b0;
    z[2] = r1 ^ a0 ^ b2;
    z[3] = q1 ^ b1 ^ a2;
    z[4] = r2 ^ a1;
    z[5] = q2;
}

static void mr_bottom4(mr_small* x, mr_small* y, mr_small* z)
{ /* unwound 4x4 karatsuba multiplication - only 9 muls */
    mr_small q0, r0, q1, r1, q2, r2, tx, ty;
    mr_small t0, t1, t2, t3;
    mr_small z0, z1, z2, z3, z4, z5, z6, z7;
    mr_small x0, x1, x2, x3, y0, y1, y2, y3;

    x0 = x[0];
    x1 = x[1];
    x2 = x[2];
    x3 = x[3];
    y0 = y[0];
    y1 = y[1];
    y2 = y[2];
    y3 = y[3];

    q0 = mr_mul2(x0, y0, &r0);
    q1 = mr_mul2(x1, y1, &r1);

    tx = x0 ^ x1;
    ty = y0 ^ y1;
    q2 = mr_mul2(tx, ty, &r2);

    z0 = r0;
    q0 ^= r1;
    z1 = q0 ^ r0 ^ r2;
    z2 = q0 ^ q1 ^ q2;
    z3 = q1;

    q0 = mr_mul2(x2, y2, &r0);

    q1 = mr_mul2(x3, y3, &r1);

    tx = x2 ^ x3;
    ty = y2 ^ y3;
    q2 = mr_mul2(tx, ty, &r2);

    z4 = r0;
    q0 ^= r1;
    z5 = q0 ^ r0 ^ r2;
    z6 = q0 ^ q1 ^ q2;
    z7 = q1;

    x2 ^= x0;
    y2 ^= y0;
    q0 = mr_mul2(x2, y2, &r0);

    x3 ^= x1;
    y3 ^= y1;
    q1 = mr_mul2(x3, y3, &r1);

    x2 ^= x3;
    y2 ^= y3;
    q2 = mr_mul2(x2, y2, &r2);

    q0 ^= r1;
    t0 = z0 ^ z4 ^ r0;
    t1 = z1 ^ z5 ^ q0 ^ r0 ^ r2;
    t2 = z2 ^ z6 ^ q0 ^ q1 ^ q2;
    t3 = z3 ^ z7 ^ q1;

    z2 ^= t0;
    z3 ^= t1;
    z4 ^= t2;
    z5 ^= t3;

    z[0] = z0;
    z[1] = z1;
    z[2] = z2;
    z[3] = z3;
    z[4] = z4;
    z[5] = z5;
    z[6] = z6;
    z[7] = z7;
}

static void mr_bottom5(mr_small* x, mr_small* y, mr_small* z)
{ /* Montgomery's 5x5 formula. Requires 13 mr_muls.... */
    mr_small u0, v0, u1, v1, u2, v2, s0, t0, q, r, t;
    mr_small z1, z2, z3, z4, z5, z6, z7, z8;
    mr_small x0, x1, x2, x3, x4, y0, y1, y2, y3, y4;

    x0 = x[0];
    x1 = x[1];
    x2 = x[2];
    x3 = x[3];
    x4 = x[4];
    y0 = y[0];
    y1 = y[1];
    y2 = y[2];
    y3 = y[3];
    y4 = y[4];

    q    = mr_mul2(x0, y0, &r);
    t    = q ^ r;
    z[0] = r;
    z1   = t;
    z2   = t;
    z3   = q;
    z4   = r;
    z5   = t;
    z6   = t;
    z7   = q;
    q    = mr_mul2(x1, y1, &r);
    z1 ^= r;
    z2 ^= q;
    z4 ^= r;
    z5 ^= q;
    q = mr_mul2(x3, y3, &r);
    z4 ^= r;
    z5 ^= q;
    z7 ^= r;
    z8 = q;
    q  = mr_mul2(x4, y4, &r);
    t  = q ^ r;
    z2 ^= r;
    z3 ^= t;
    z4 ^= t;
    z5 ^= q;
    z6 ^= r;
    z7 ^= t;
    z8 ^= t;
    z[9] = q;

    u0 = x0 ^ x4; /* u0=a0+a4 */
    v0 = y0 ^ y4; /* v0=b0+b4 */
    q  = mr_mul2(u0, v0, &r);
    t  = q ^ r;
    z2 ^= r;
    z3 ^= t;
    z4 ^= q;
    z5 ^= r;
    z6 ^= t;
    z7 ^= q;

    u1 = x0 ^ x1; /* u1=a0+a1 */
    v1 = y0 ^ y1; /* v1=b0+b1 */
    q  = mr_mul2(u1, v1, &r);
    t  = q ^ r;
    z1 ^= r;
    z2 ^= t;
    z3 ^= q;
    z4 ^= r;
    z5 ^= t;
    z6 ^= q;

    u2 = x3 ^ x4; /* u2=a3+a4 */
    v2 = y3 ^ y4; /* v2=b3+b4 */
    q  = mr_mul2(u2, v2, &r);
    t  = q ^ r;
    z3 ^= r;
    z4 ^= t;
    z5 ^= q;
    z6 ^= r;
    z7 ^= t;
    z8 ^= q;

    /*u=u1^u2;   u=a0+a1+a3+a4 */
    /*v=v1^v2;    v=b0+b1+b3+b4 */
    q = mr_mul2(u1 ^ u2, v1 ^ v2, &r);
    z3 ^= r;
    z4 ^= q;
    z5 ^= r;
    z6 ^= q;

    s0 = u1 ^ x2; /* s0=a0+a1+a2 */
    t0 = v1 ^ y2; /* t0=b0+b1+b2 */
    u2 ^= x2;     /* u2=a2+a3+a4 */
    v2 ^= y2;     /* v2=b2+b3+b4 */
    u1 ^= u2;     /* u1=a0+a1+a2+a3+a4 */
    v1 ^= v2;     /* v1=b0+b1+b2+b3+b4 */
    q = mr_mul2(u1, v1, &r);
    t = q ^ r;
    z3 ^= r;
    z4 ^= t;
    z5 ^= t;
    z6 ^= q;

    u2 ^= x0; /* s1=a0+a2+a3+a4 */
    v2 ^= y0; /* t1=b0+b2+b3+b4 */
    q = mr_mul2(u2, v2, &r);
    z3 ^= r;
    z4 ^= q;
    z6 ^= r;
    z7 ^= q;

    s0 ^= x4; /* s0=a0+a1+a2+a4 */
    t0 ^= y4; /* t0=b0+b1+b2+b4 */
    q = mr_mul2(s0, t0, &r);
    z2 ^= r;
    z3 ^= q;
    z5 ^= r;
    z6 ^= q;

    u2 ^= x4; /* u2=a0+a2+a3 */
    v2 ^= y4; /* v2=b0+b2+b3 */
    q = mr_mul2(u2, v2, &r);
    z4 ^= r;
    z5 ^= q;
    z6 ^= r;
    z7 ^= q;

    s0 ^= x0; /* s0=a1+a2+a4 */
    t0 ^= y0; /* t0=b1+b2+b4 */
    q = mr_mul2(s0, t0, &r);
    z2 ^= r;
    z3 ^= q;
    z4 ^= r;
    z5 ^= q;

    z[1] = z1;
    z[2] = z2;
    z[3] = z3;
    z[4] = z4;
    z[5] = z5;
    z[6] = z6;
    z[7] = z7;
    z[8] = z8;
}

void karmul2(int n, mr_small* t, mr_small* x, mr_small* y, mr_small* z)
{ /* Karatsuba multiplication - note that n can be odd or even */
    int m, nd2, nd, md, md2;

    if (n <= 5) {
        if (n == 1) {
            z[1] = mr_mul2(x[0], y[0], &z[0]);
            return;
        }
        if (n == 2) {
            mr_bottom2(x, y, z);
            return;
        }
        if (n == 3) {
            mr_bottom3(x, y, z);
            return;
        }
        if (n == 4) {
            mr_bottom4(x, y, z);
            return;
        }
        if (n == 5) {
            mr_bottom5(x, y, z);
            return;
        }
    }
    if (n % 2 == 0) {
        md = nd = n;
        md2 = nd2 = n / 2;
    } else {
        nd  = n + 1;
        md  = n - 1;
        nd2 = nd / 2;
        md2 = md / 2;
    }

    for (m = 0; m < nd2; m++) {
        z[m]       = x[m];
        z[nd2 + m] = y[m];
    }
    for (m = 0; m < md2; m++) {
        z[m] ^= x[nd2 + m];
        z[nd2 + m] ^= y[nd2 + m];
    }

    karmul2(nd2, &t[nd], z, &z[nd2], t);
    karmul2(nd2, &t[nd], x, y, z);

    for (m = 0; m < nd; m++) t[m] ^= z[m];

    karmul2(md2, &t[nd], &x[nd2], &y[nd2], &z[nd]);

    for (m = 0; m < md; m++) t[m] ^= z[nd + m];
    for (m = 0; m < nd; m++) z[nd2 + m] ^= t[m];
}

#endif

/* this is time-critical, so use karatsuba here, since addition is cheap *
 * and easy (no carries to worry about...)                               */

/* #define CLAIRE */

void multiply2(_MIPD_ big x, big y, big w)
{
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif

#ifdef MR_COMBA2
    comba_mult2(_MIPP_ x, y, w);
    return;
#else
    int      i, j, xl, yl, ml;
#ifdef CLAIRE
    int      d;
    mr_small hi, lo;
#endif
    mr_small p, q;

    big w0 = mr_mip->w0;

    if (x == NULL || y == NULL) {
        zero(w);
        return;
    }
    if (x->len == 0 || y->len == 0) {
        zero(w);
        return;
    }

    xl = x->len;
    yl = y->len;
    zero(w0);

#ifdef CLAIRE

    /* Comba method */

    w0->len = xl + yl;
    d       = 1 + mr_mip->M / MIRACL;
    hi = lo = 0;
    for (i = 0; i < d; i++) {
        for (j = 0; j <= i; j++) {
            q = mr_mul2(x->w[j], y->w[i - j], &p);
            hi ^= q;
            lo ^= p;
        }
        w0->w[i] = lo;
        lo       = hi;
        hi       = 0;
    }
    for (i = d; i < 2 * d - 1; i++) {
        for (j = i - d + 1; j < d; j++) {
            q = mr_mul2(x->w[j], y->w[i - j], &p);
            hi ^= q;
            lo ^= p;
        }
        w0->w[i] = lo;
        lo       = hi;
        hi       = 0;
    }
    w0->w[2 * d - 1] = lo;
    mr_lzero(w0);
    copy(w0, w);

#else

    /* recommended method as mr_mul2 is so slow... */

    if (xl >= MR_KARATSUBA && yl >= MR_KARATSUBA) {
        if (xl > yl)
            ml = xl;
        else
            ml = yl;

        karmul2(ml, mr_mip->w7->w, x->w, y->w, w0->w);

        mr_mip->w7->len = w0->len = 2 * ml + 1;
        mr_lzero(w0);
        mr_lzero(mr_mip->w7);
        copy(w0, w);
        return;
    }

    w0->len = xl + yl;
    for (i = 0; i < xl; i++) {
        for (j = 0; j < yl; j++) {
            q = mr_mul2(x->w[i], y->w[j], &p);
            w0->w[i + j] ^= p;
            w0->w[i + j + 1] ^= q;
        }
    }
    mr_lzero(w0);
    copy(w0, w);
#endif
#endif
}

void add2(big x, big y, big z)
{ /* XOR x and y */
    int       i, lx, ly, lz, lm;
    mr_small *gx, *gy, *gz;

    if (x == y) {
        zero(z);
        return;
    }
    if (y == NULL) {
        copy(x, z);
        return;
    } else if (x == NULL) {
        copy(y, z);
        return;
    }

    if (x == z) {
        gy = y->w;
        gz = z->w;
        ly = y->len;
        lz = z->len;
        lm = lz;
        if (ly > lz) lm = ly;
        for (i = 0; i < lm; i++) gz[i] ^= gy[i];
        z->len = lm;
        if (gz[lm - 1] == 0) mr_lzero(z);
    } else {
        gx = x->w;
        gy = y->w;
        gz = z->w;
        lx = x->len;
        ly = y->len;
        lz = z->len;
        lm = lx;
        if (ly > lx) lm = ly;

        for (i = 0; i < lm; i++) gz[i] = gx[i] ^ gy[i];
        for (i = lm; i < lz; i++) gz[i] = 0;
        z->len = lm;
        if (gz[lm - 1] == 0) mr_lzero(z);
    }
}

static void remain2(_MIPD_ big y, big x)
{ /* generic "remainder" program. x%=y */
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    int my = numbits(y);
    int mx = numbits(x);
    while (mx >= my) {
        copy(y, mr_mip->w7);
        shiftleftbits(mr_mip->w7, mx - my);
        add2(x, mr_mip->w7, x);
        mx = numbits(x);
    }
    return;
}

void gcd2(_MIPD_ big x, big y, big g)
{
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    if (size(y) == 0) {
        copy(x, g);
        return;
    }
    copy(x, mr_mip->w1);
    copy(y, mr_mip->w2);
    forever
    {
        remain2(_MIPP_ mr_mip->w2, mr_mip->w1);
        if (size(mr_mip->w1) == 0) break;
        copy(mr_mip->w1, mr_mip->w3);
        copy(mr_mip->w2, mr_mip->w1);
        copy(mr_mip->w3, mr_mip->w2);
    }
    copy(mr_mip->w2, g);
}

/* See "Elliptic Curves in Cryptography", Blake, Seroussi & Smart,
   Cambridge University Press, 1999, page 20, for this fast reduction
   routine - algorithm II.9 */

void reduce2(_MIPD_ big y, big x)
{ /* reduction wrt the trinomial or pentanomial modulus        *
   * Note that this is linear O(n), and thus not time critical */
    int      k1, k2, k3, k4, ls1, ls2, ls3, ls4, rs1, rs2, rs3, rs4, i;
    int      M, A, B, C;
    int      xl;
    mr_small top, *gx, w;

#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif

    if (x != y) copy(y, x);
    xl = x->len;
    gx = x->w;

    M = mr_mip->M;
    A = mr_mip->AA;
    if (A == 0) {
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return;
    }
    B = mr_mip->BB;
    C = mr_mip->CC;

    /* If optimizing agressively it makes sense to make this code specific to a
       particular field For example code like this can be optimized for the case
       m=163. Note that the general purpose code involves lots of branches -
       these cause breaks in the pipeline and they are slow. Further loop
       unrolling would be even faster...

       Version 5.10 - optimal code for 32-bit processors and for some NIST
       curves added Version 5.22 - some code for a 16-bit processor..

       Version 5.23 - Use findbase.cpp to find "best" irreducible polynomial
       Version 5.23 - Use utility irp.cpp to automatically generate optimal code
       for insertion here
    */

#if MIRACL == 8

    if (M == 163 && A == 7 && B == 6 && C == 3) {
        for (i = xl - 1; i >= 21; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 19] ^= (w >> 4) ^ (w >> 5);
            gx[i - 20] ^= (w >> 3) ^ (w << 4) ^ (w << 3) ^ w;
            gx[i - 21] ^= (w << 5);
        } /* XORs= 7 shifts= 6 */
        top = gx[20] >> 3;
        gx[0] ^= top;
        top <<= 3;
        gx[0] ^= (top << 4) ^ (top << 3) ^ top;
        gx[1] ^= (top >> 4) ^ (top >> 5);
        gx[20] ^= top;
        x->len = 21;
        if (gx[20] == 0) mr_lzero(x);
        return;
    }

    if (M == 271 && A == 201) {
        for (i = xl - 1; i >= 34; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 8] ^= (w >> 6);
            gx[i - 9] ^= (w << 2);
            gx[i - 33] ^= (w >> 7);
            gx[i - 34] ^= (w << 1);
        } /* XORs= 4 shifts= 4 */
        top = gx[33] >> 7;
        gx[0] ^= top;
        top <<= 7;
        gx[24] ^= (top << 2);
        gx[25] ^= (top >> 6);
        gx[33] ^= top;
        x->len = 34;
        if (gx[33] == 0) mr_lzero(x);
        return;
    }

    if (M == 271 && A == 207 && B == 175 && C == 111) {
        for (i = xl - 1; i >= 34; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 8] ^= w;
            gx[i - 12] ^= w;
            gx[i - 20] ^= w;
            gx[i - 33] ^= (w >> 7);
            gx[i - 34] ^= (w << 1);
        } /* XORs= 5 shifts= 2 */
        top = gx[33] >> 7;
        gx[0] ^= top;
        top <<= 7;
        gx[13] ^= top;
        gx[21] ^= top;
        gx[25] ^= top;
        gx[33] ^= top;
        x->len = 34;
        if (gx[33] == 0) mr_lzero(x);
        return;
    }

#endif

#if MIRACL == 16

    if (M == 163 && A == 7 && B == 6 && C == 3) {
        for (i = xl - 1; i >= 11; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 10] ^= (w >> 3) ^ (w << 3) ^ (w << 4) ^ w;
            gx[i - 11] ^= (w << 13);
            gx[i - 9] ^= (w >> 12) ^ (w >> 13);
        }
        top = gx[10] >> 3;
        gx[0] ^= top;
        top <<= 3;

        gx[1] ^= (top >> 12) ^ (top >> 13);
        gx[0] ^= (top << 4) ^ (top << 3) ^ top;

        gx[10] ^= top;
        x->len = 11;
        if (gx[10] == 0) mr_lzero(x);

        return;
    }

    if (M == 271 && A == 201 && B == 0) {
        for (i = xl - 1; i >= 17; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 17] ^= (w << 1);
            gx[i - 16] ^= (w >> 15);
            gx[i - 5] ^= (w << 10);
            gx[i - 4] ^= (w >> 6);
        }
        top = gx[16] >> 15;
        gx[0] ^= top;
        top <<= 15;
        gx[12] ^= (top >> 6);
        gx[11] ^= (top << 10);

        gx[16] ^= top;
        x->len = 17;
        if (gx[16] == 0) mr_lzero(x);

        return;
    }

    if (M == 271 && A == 207 && B == 175 && C == 111) {
        for (i = xl - 1; i >= 17; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 4] ^= w;
            gx[i - 6] ^= w;
            gx[i - 10] ^= w;
            gx[i - 16] ^= (w >> 15);
            gx[i - 17] ^= (w << 1);
        } /* XORs= 5 shifts= 2 */
        top = gx[16] >> 15;
        gx[0] ^= top;
        top <<= 15;
        gx[6] ^= top;
        gx[10] ^= top;
        gx[12] ^= top;
        gx[16] ^= top;
        x->len = 17;
        if (gx[16] == 0) mr_lzero(x);
        return;
    }

#endif

#if MIRACL == 32

    if (M == 127 && A == 63) {
        for (i = xl - 1; i >= 4; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 2] ^= w;
            gx[i - 3] ^= (w >> 31);
            gx[i - 4] ^= (w << 1);
        } /* XORs= 3 shifts= 2 */
        top = gx[3] >> 31;
        gx[0] ^= top;
        top <<= 31;
        gx[1] ^= top;
        gx[3] ^= top;
        x->len = 4;
        if (gx[3] == 0) mr_lzero(x);
        return;
    }

    if (M == 163 && A == 7 && B == 6 && C == 3) {
        for (i = xl - 1; i >= 6; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 5] ^= ((w >> 3) ^ (w << 4) ^ (w << 3) ^ w);
            gx[i - 6] ^= (w << 29);
            gx[i - 4] ^= ((w >> 28) ^ (w >> 29));
        }
        top = gx[5] >> 3;

        gx[0] ^= top;
        top <<= 3;
        gx[1] ^= (top >> 28) ^ (top >> 29);
        gx[0] ^= top ^ (top << 4) ^ (top << 3);

        gx[5] ^= top;

        x->len = 6;
        if (gx[5] == 0) mr_lzero(x);

        return;
    }

    if (M == 163 && A == 99 && B == 97 && C == 3) {
        for (i = xl - 1; i >= 6; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 2] ^= w ^ (w >> 2);
            gx[i - 3] ^= (w << 30);
            gx[i - 5] ^= (w >> 3) ^ w;
            gx[i - 6] ^= (w << 29);
        }
        top = gx[5] >> 3;
        gx[0] ^= top;
        top <<= 3;
        gx[0] ^= top;
        gx[2] ^= (top << 30);
        gx[3] ^= top ^ (top >> 2);
        gx[5] ^= top;
        x->len = 6;
        if (gx[5] == 0) mr_lzero(x);
        return;
    }

    if (M == 233 && A == 74 && B == 0) {
        for (i = xl - 1; i >= 8; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 8] ^= (w << 23);
            gx[i - 7] ^= (w >> 9);
            gx[i - 5] ^= (w << 1);
            gx[i - 4] ^= (w >> 31);
        }
        top = gx[7] >> 9;
        gx[0] ^= top;
        gx[2] ^= (top << 10);
        gx[3] ^= (top >> 22);
        gx[7] &= 0x1FF;

        x->len = 8;
        if (gx[7] == 0) mr_lzero(x);
        return;
    }

    if (M == 233 && A == 159 && B == 0) {
        for (i = xl - 1; i >= 8; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 2] ^= (w >> 10);
            gx[i - 3] ^= (w << 22);
            gx[i - 7] ^= (w >> 9);
            gx[i - 8] ^= (w << 23);
        }
        top = gx[7] >> 9;
        gx[0] ^= top;
        top <<= 9;
        gx[4] ^= (top << 22);
        gx[5] ^= (top >> 10);
        gx[7] ^= top;
        x->len = 8;
        if (gx[7] == 0) mr_lzero(x);
        return;
    }

    if (M == 233 && A == 201 && B == 105 && C == 9) {
        for (i = xl - 1; i >= 8; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 1] ^= w;
            gx[i - 4] ^= w;
            gx[i - 7] ^= (w >> 9) ^ w;
            gx[i - 8] ^= (w << 23);
        }
        top = gx[7] >> 9;
        gx[0] ^= top;
        top <<= 9;
        gx[0] ^= top;
        gx[3] ^= top;
        gx[6] ^= top;
        gx[7] ^= top;
        x->len = 8;
        if (gx[7] == 0) mr_lzero(x);

        return;
    }

    if (M == 103 && A == 9 && B == 0) {
        for (i = xl - 1; i >= 4; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 3] ^= ((w >> 7) ^ (w << 2));
            gx[i - 4] ^= (w << 25);
            gx[i - 2] ^= (w >> 30);
        }
        top = gx[3] >> 7;
        gx[0] ^= top;
        top <<= 7;
        gx[1] ^= (top >> 30);
        gx[0] ^= (top << 2);
        gx[3] ^= top;
        x->len = 4;
        if (gx[3] == 0) mr_lzero(x);

        return;
    }

    if (M == 283 && A == 12 && B == 7 && C == 5) {
        for (i = xl - 1; i >= 9; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 9] ^= (w << 5) ^ (w << 10) ^ (w << 12) ^ (w << 17);
            gx[i - 8] ^= (w >> 27) ^ (w >> 22) ^ (w >> 20) ^ (w >> 15);
        }

        top = gx[8] >> 27;
        gx[0] ^= top ^ (top << 5) ^ (top << 7) ^ (top << 12);
        gx[8] &= 0x7FFFFFF;

        x->len = 9;
        if (gx[8] == 0) mr_lzero(x);
        return;
    }

    if (M == 283 && A == 249 && B == 219 && C == 27) {
        for (i = xl - 1; i >= 9; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 1] ^= (w >> 2);
            gx[i - 2] ^= (w << 30) ^ w;
            gx[i - 8] ^= (w >> 27) ^ w;
            gx[i - 9] ^= (w << 5);
        } /* XORs= 6 shifts= 4 */
        top = gx[8] >> 27;
        gx[0] ^= top;
        top <<= 27;
        gx[0] ^= top;
        gx[6] ^= (top << 30) ^ top;
        gx[7] ^= (top >> 2);
        gx[8] ^= top;
        x->len = 9;
        if (gx[8] == 0) mr_lzero(x);
        return;
    }

    if (M == 313 && A == 121 && B == 0) {
        for (i = xl - 1; i >= 10; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 6] ^= w;
            gx[i - 9] ^= (w >> 25);
            gx[i - 10] ^= (w << 7);
        }
        top = gx[9] >> 25;
        gx[0] ^= top;
        top <<= 25;
        gx[3] ^= top;
        gx[9] ^= top;
        x->len = 10;
        if (gx[9] == 0) mr_lzero(x);
        return;
    }

    if (M == 379 && A == 253 && B == 251 && C == 59) {
        for (i = xl - 1; i >= 12; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 3] ^= (w >> 30);
            gx[i - 4] ^= (w << 2) ^ w;
            gx[i - 10] ^= w;
            gx[i - 11] ^= (w >> 27);
            gx[i - 12] ^= (w << 5);
        } /* XORs= 6 shifts= 4 */
        top = gx[11] >> 27;
        gx[0] ^= top;
        top <<= 27;
        gx[1] ^= top;
        gx[7] ^= (top << 2) ^ top;
        gx[8] ^= (top >> 30);
        gx[11] ^= top;
        x->len = 12;
        if (gx[11] == 0) mr_lzero(x);
        return;
    }

    if (M == 571 && A == 10 && B == 5 && C == 2) {
        for (i = xl - 1; i >= 18; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 18] ^= (w << 5) ^ (w << 7) ^ (w << 10) ^ (w << 15);
            gx[i - 17] ^= (w >> 27) ^ (w >> 25) ^ (w >> 22) ^ (w >> 17);
        }

        top = gx[17] >> 27;
        gx[0] ^= top ^ (top << 2) ^ (top << 5) ^ (top << 10);
        gx[17] &= 0x7FFFFFF;

        x->len = 18;
        if (gx[17] == 0) mr_lzero(x);

        return;
    }

    if (M == 571 && A == 507 && B == 475 && C == 417) {
        for (i = xl - 1; i >= 18; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 2] ^= w;
            gx[i - 3] ^= w;
            gx[i - 4] ^= (w >> 26);
            gx[i - 5] ^= (w << 6);
            gx[i - 17] ^= (w >> 27);
            gx[i - 18] ^= (w << 5);
        } /* XORs= 6 shifts= 4 */
        top = gx[17] >> 27;
        gx[0] ^= top;
        top <<= 27;
        gx[12] ^= (top << 6);
        gx[13] ^= (top >> 26);
        gx[14] ^= top;
        gx[15] ^= top;
        gx[17] ^= top;
        x->len = 18;
        if (gx[17] == 0) mr_lzero(x);
        return;
    }

    if (M == 1223 && A == 255) {
        for (i = xl - 1; i >= 39; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 30] ^= (w >> 8);
            gx[i - 31] ^= (w << 24);
            gx[i - 38] ^= (w >> 7);
            gx[i - 39] ^= (w << 25);
        } /* XORs= 4 shifts= 4 */
        top = gx[38] >> 7;
        gx[0] ^= top;
        top <<= 7;
        gx[7] ^= (top << 24);
        gx[8] ^= (top >> 8);
        gx[38] ^= top;
        x->len = 39;
        if (gx[38] == 0) mr_lzero(x);
        return;
    }

#endif

#if MIRACL == 64
    if (M == 1223 && A == 255) {
        for (i = xl - 1; i >= 20; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 15] ^= (w >> 8);
            gx[i - 16] ^= (w << 56);
            gx[i - 19] ^= (w >> 7);
            gx[i - 20] ^= (w << 57);
        }
        top = gx[19] >> 7;
        gx[0] ^= top;
        top <<= 7;
        gx[3] ^= (top << 56);
        gx[4] ^= (top >> 8);
        gx[19] ^= top;
        x->len = 20;
        if (gx[19] == 0) mr_lzero(x);
        return;
    }

    if (M == 379 && A == 253 && B == 251 && C == 59) {
        for (i = xl - 1; i >= 6; i--) {
            w     = gx[i];
            gx[i] = 0;
            gx[i - 1] ^= (w >> 62);
            gx[i - 2] ^= (w << 2) ^ w;
            gx[i - 5] ^= (w >> 59) ^ w;
            gx[i - 6] ^= (w << 5);
        } /* XORs= 6 shifts= 4 */
        top = gx[5] >> 59;
        gx[0] ^= top;
        top <<= 59;
        gx[0] ^= top;
        gx[3] ^= (top << 2) ^ top;
        gx[4] ^= (top >> 62);
        gx[5] ^= top;
        x->len = 6;
        if (gx[5] == 0) mr_lzero(x);
        return;
    }
#endif

    k3 = k4 = rs3 = ls3 = rs4 = ls4 = 0;
    k1 = 1 + M / MIRACL; /* words from MSB to LSB */

    if (xl <= k1) {
        if (numbits(x) <= M) return;
    }

    rs1 = M % MIRACL;
    ls1 = MIRACL - rs1;

    if (M - A < MIRACL) { /* slow way */
        while (numbits(x) >= M + 1) {
            copy(mr_mip->modulus, mr_mip->w7);
            shiftleftbits(mr_mip->w7, numbits(x) - M - 1);
            add2(x, mr_mip->w7, x);
        }
        return;
    }

    k2  = 1 + (M - A) / MIRACL; /* words from MSB to bit */
    rs2 = (M - A) % MIRACL;
    ls2 = MIRACL - rs2;

    if (B) { /* Pentanomial */
        k3  = 1 + (M - B) / MIRACL;
        rs3 = (M - B) % MIRACL;
        ls3 = MIRACL - rs3;

        k4  = 1 + (M - C) / MIRACL;
        rs4 = (M - C) % MIRACL;
        ls4 = MIRACL - rs4;
    }

    for (i = xl - 1; i >= k1; i--) {
        w     = gx[i];
        gx[i] = 0;
        if (rs1 == 0)
            gx[i - k1 + 1] ^= w;
        else {
            gx[i - k1 + 1] ^= (w >> rs1);
            gx[i - k1] ^= (w << ls1);
        }
        if (rs2 == 0)
            gx[i - k2 + 1] ^= w;
        else {
            gx[i - k2 + 1] ^= (w >> rs2);
            gx[i - k2] ^= (w << ls2);
        }
        if (B) {
            if (rs3 == 0)
                gx[i - k3 + 1] ^= w;
            else {
                gx[i - k3 + 1] ^= (w >> rs3);
                gx[i - k3] ^= (w << ls3);
            }
            if (rs4 == 0)
                gx[i - k4 + 1] ^= w;
            else {
                gx[i - k4 + 1] ^= (w >> rs4);
                gx[i - k4] ^= (w << ls4);
            }
        }
    }

    top = gx[k1 - 1] >> rs1;

    if (top != 0) {
        gx[0] ^= top;
        top <<= rs1;

        if (rs2 == 0)
            gx[k1 - k2] ^= top;
        else {
            gx[k1 - k2] ^= (top >> rs2);
            if (k1 > k2) gx[k1 - k2 - 1] ^= (top << ls2);
        }
        if (B) {
            if (rs3 == 0)
                gx[k1 - k3] ^= top;
            else {
                gx[k1 - k3] ^= (top >> rs3);
                if (k1 > k3) gx[k1 - k3 - 1] ^= (top << ls3);
            }
            if (rs4 == 0)
                gx[k1 - k4] ^= top;
            else {
                gx[k1 - k4] ^= (top >> rs4);
                if (k1 > k4) gx[k1 - k4 - 1] ^= (top << ls4);
            }
        }
        gx[k1 - 1] ^= top;
    }
    x->len = k1;
    if (gx[k1 - 1] == 0) mr_lzero(x);
}

void incr2(big x, int n, big w)
{ /* increment x by small amount */
    if (x != w) copy(x, w);
    if (n == 0) return;
    if (w->len == 0) {
        w->len  = 1;
        w->w[0] = n;
    } else {
        w->w[0] ^= (mr_small)n;
        if (w->len == 1 && w->w[0] == 0) w->len = 0;
    }
}

void modsquare2(_MIPD_ big x, big w)
{ /* w=x*x mod f */
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif

    square2(x, mr_mip->w0);
    reduce2(_MIPP_ mr_mip->w0, mr_mip->w0);
    copy(mr_mip->w0, w);
}

/* Experimental code for GF(2^103) modular multiplication *
 * Inspired by Robert Harley's ECDL code                  */

#ifdef SP103

#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif

void modmult2(_MIPD_ big x, big y, big w)
{
    int      i, j;
    mr_small b;

    __m128i t[16];
    __m128i m, r, s, p, q, xe, xo;
    __m64   a3, a2, a1, a0, top;

    if (x == y) {
        modsquare2(_MIPP_ x, w);
        return;
    }

    if (x->len == 0 || y->len == 0) {
        zero(w);
        return;
    }

#ifdef MR_COUNT_OPS
    fpm2++;
#endif

    m = _mm_set_epi32(0, 0, 0xff << 24, 0); /* shifting mask */

    /* precompute a small table */

    t[0]  = _mm_set1_epi32(0);
    xe    = _mm_set_epi32(0, x->w[2], 0, x->w[0]);
    xo    = _mm_set_epi32(0, x->w[3], 0, x->w[1]);
    t[1]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    xe    = _mm_slli_epi64(xe, 1);
    xo    = _mm_slli_epi64(xo, 1);
    t[2]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    t[3]  = _mm_xor_si128(t[2], t[1]);
    xe    = _mm_slli_epi64(xe, 1);
    xo    = _mm_slli_epi64(xo, 1);
    t[4]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    t[5]  = _mm_xor_si128(t[4], t[1]);
    t[6]  = _mm_xor_si128(t[4], t[2]);
    t[7]  = _mm_xor_si128(t[4], t[3]);
    xe    = _mm_slli_epi64(xe, 1);
    xo    = _mm_slli_epi64(xo, 1);
    t[8]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    t[9]  = _mm_xor_si128(t[8], t[1]);
    t[10] = _mm_xor_si128(t[8], t[2]);
    t[11] = _mm_xor_si128(t[8], t[3]);
    t[12] = _mm_xor_si128(t[8], t[4]);
    t[13] = _mm_xor_si128(t[8], t[5]);
    t[14] = _mm_xor_si128(t[8], t[6]);
    t[15] = _mm_xor_si128(t[8], t[7]);

    b = y->w[0];

    i = b & 0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4); /* net shift left 4 */
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    p = q = r;
    q     = _mm_srli_si128(q, 1);

    i = (b >> 8) & 0xf;
    j = (b >> 12) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 1);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 16) & 0xf;
    j = (b >> 20) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 2);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 24) & 0xf;
    j = (b >> 28);
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 3);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    b = y->w[1];

    i = (b)&0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 4);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 8) & 0xf;
    j = (b >> 12) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 5);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 16) & 0xf;
    j = (b >> 20) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 6);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 24) & 0xf;
    j = (b >> 28);
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 7);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    b = y->w[2];

    i = (b)&0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 8);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 8) & 0xf;
    j = (b >> 12) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 9);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 16) & 0xf;
    j = (b >> 20) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 10);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 24) & 0xf;
    j = (b >> 28);
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 11);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    b = y->w[3];

    i = (b)&0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 12);
    p = _mm_xor_si128(p, r);

    q = _mm_srli_si128(q, 4); /* only 103 bits, so we are done */

    /* modular reduction - x^103+x^9+1 */

    a0 = _mm_movepi64_pi64(p);
    a1 = _mm_movepi64_pi64(_mm_srli_si128(p, 8));
    a2 = _mm_movepi64_pi64(q);
    a3 = _mm_movepi64_pi64(_mm_srli_si128(q, 8));

    a2 = _m_pxor(a2, _m_psrlqi(a3, 39));
    a2 = _m_pxor(a2, _m_psrlqi(a3, 30));
    a1 = _m_pxor(a1, _m_psllqi(a3, 25));
    a1 = _m_pxor(a1, _m_psllqi(a3, 34));

    a1 = _m_pxor(a1, _m_psrlqi(a2, 39));
    a1 = _m_pxor(a1, _m_psrlqi(a2, 30));
    a0 = _m_pxor(a0, _m_psllqi(a2, 25));
    a0 = _m_pxor(a0, _m_psllqi(a2, 34));

    top = _m_psrlqi(a1, 39);
    a0  = _m_pxor(a0, top);
    top = _m_psllqi(top, 39);
    a0  = _m_pxor(a0, _m_psrlqi(top, 30));
    a1  = _m_pxor(a1, top);

    if (w->len > 4) zero(w);

    w->w[0] = _m_to_int(a0);
    a0      = _m_psrlqi(a0, 32);
    w->w[1] = _m_to_int(a0);
    w->w[2] = _m_to_int(a1);
    a1      = _m_psrlqi(a1, 32);
    w->w[3] = _m_to_int(a1);

    w->len = 4;
    if (w->w[3] == 0) mr_lzero(w);
    _m_empty();
}

#endif

#ifdef SP79

#ifdef __GNUC__
#include <xmmintrin.h>
#else
#include <emmintrin.h>
#endif

void modmult2(_MIPD_ big x, big y, big w)
{
    int      i, j;
    mr_small b;

    __m128i t[16];
    __m128i m, r, s, p, q, xe, xo;
    __m64   a2, a1, a0, top;

    if (x == y) {
        modsquare2(_MIPP_ x, w);
        return;
    }
#ifdef MR_COUNT_OPS
    fpm2++;
#endif
    if (x->len == 0 || y->len == 0) {
        zero(w);
        return;
    }

    m = _mm_set_epi32(0, 0, 0xff << 24, 0); /* shifting mask */

    /* precompute a small table */

    t[0]  = _mm_set1_epi32(0);
    xe    = _mm_set_epi32(0, x->w[2], 0, x->w[0]);
    xo    = _mm_set_epi32(0, 0, 0, x->w[1]);
    t[1]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    xe    = _mm_slli_epi64(xe, 1);
    xo    = _mm_slli_epi64(xo, 1);
    t[2]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    t[3]  = _mm_xor_si128(t[2], t[1]);
    xe    = _mm_slli_epi64(xe, 1);
    xo    = _mm_slli_epi64(xo, 1);
    t[4]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    t[5]  = _mm_xor_si128(t[4], t[1]);
    t[6]  = _mm_xor_si128(t[4], t[2]);
    t[7]  = _mm_xor_si128(t[4], t[3]);
    xe    = _mm_slli_epi64(xe, 1);
    xo    = _mm_slli_epi64(xo, 1);
    t[8]  = _mm_xor_si128(xe, _mm_slli_si128(xo, 4));
    t[9]  = _mm_xor_si128(t[8], t[1]);
    t[10] = _mm_xor_si128(t[8], t[2]);
    t[11] = _mm_xor_si128(t[8], t[3]);
    t[12] = _mm_xor_si128(t[8], t[4]);
    t[13] = _mm_xor_si128(t[8], t[5]);
    t[14] = _mm_xor_si128(t[8], t[6]);
    t[15] = _mm_xor_si128(t[8], t[7]);

    b = y->w[0];

    i = b & 0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4); /* net shift left 4 */
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    p = q = r;
    q     = _mm_srli_si128(q, 1);

    i = (b >> 8) & 0xf;
    j = (b >> 12) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 1);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 16) & 0xf;
    j = (b >> 20) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 2);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 24) & 0xf;
    j = (b >> 28);
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 3);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    b = y->w[1];

    i = (b)&0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 4);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 8) & 0xf;
    j = (b >> 12) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 5);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 16) & 0xf;
    j = (b >> 20) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 6);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 24) & 0xf;
    j = (b >> 28);
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 7);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    b = y->w[2];

    i = (b)&0xf;
    j = (b >> 4) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 8);
    p = _mm_xor_si128(p, r);
    q = _mm_srli_si128(q, 1);

    i = (b >> 8) & 0xf;
    j = (b >> 12) & 0xf;
    r = t[j];
    s = _mm_and_si128(r, m);
    r = _mm_slli_epi64(r, 4);
    s = _mm_slli_si128(s, 1);
    s = _mm_srli_epi64(s, 4);
    r = _mm_xor_si128(r, s);
    r = _mm_xor_si128(r, t[i]);
    q = _mm_xor_si128(q, r);
    r = _mm_slli_si128(r, 9);
    p = _mm_xor_si128(p, r);

    q = _mm_srli_si128(q, 7); /* only 79 bits, so we are done */

    /* modular reduction - x^79+x^9+1 */

    a0 = _mm_movepi64_pi64(p);
    a1 = _mm_movepi64_pi64(_mm_srli_si128(p, 8));
    a2 = _mm_movepi64_pi64(q);

    a1 = _m_pxor(a1, _m_psrlqi(a2, 15));
    a1 = _m_pxor(a1, _m_psrlqi(a2, 6));
    a0 = _m_pxor(a0, _m_psllqi(a2, 49));
    a0 = _m_pxor(a0, _m_psllqi(a2, 58));

    top = _m_psrlqi(a1, 15);
    a0  = _m_pxor(a0, top);
    top = _m_psllqi(top, 15);
    a0  = _m_pxor(a0, _m_psrlqi(top, 6));
    a1  = _m_pxor(a1, top);

    w->w[2] = _m_to_int(a1);

    if (w->len > 3) { /* Yes I know its crazy, but its needed to fix the broken
                         /O2 optimizer */
        for (i = 3; i < w->len; i++) w->w[i] = 0;
    }

    w->w[0] = _m_to_int(a0);
    a0      = _m_psrlqi(a0, 32);
    w->w[1] = _m_to_int(a0);

    w->len = 3;
    if (w->w[2] == 0) mr_lzero(w);
    _m_empty();
}

#endif

#ifndef SP103
#ifndef SP79
/*#ifndef SP271 */

void modmult2(_MIPD_ big x, big y, big w)
{ /* w=x*y mod f */
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif

    if (x == NULL || y == NULL) {
        zero(w);
        return;
    }

    if (x == y) {
        modsquare2(_MIPP_ x, w);
        return;
    }

    if (y->len == 0) {
        zero(w);
        return;
    }

    if (y->len == 1) {
        if (y->w[0] == 1) {
            copy(x, w);
            return;
        }
    }

#ifdef MR_COUNT_OPS
    fpm2++;
#endif

    multiply2(_MIPP_ x, y, mr_mip->w0);
    reduce2(_MIPP_ mr_mip->w0, mr_mip->w0);
    copy(mr_mip->w0, w);
}

#endif
#endif
/*#endif*/

/* Will be *much* faster if M,A,(B and C) are all odd */
/* This could/should be optimized for a particular irreducible polynomial and
 * fixed A, B and C */

void sqroot2(_MIPD_ big x, big y)
{
    int i, M, A, B, C;
    int k, n, h, s, a, aw, ab, bw, bb, cw, cb;
#if MIRACL != 32
    int mm, j;
#endif
    mr_small *wk, w, we, wo;
    BOOL      slow;
    /* Using Harley's trick */
    static const mr_small evens[16] = {0, 1, 4,  5,  2,  3,  6,  7,
                                       8, 9, 12, 13, 10, 11, 14, 15};
    static const mr_small odds[16]  = {0, 4, 1, 5, 8,  12, 9,  13,
                                      2, 6, 3, 7, 10, 14, 11, 15};

#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    M = mr_mip->M;
    A = mr_mip->AA;
    if (A == 0) {
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return;
    }
    B = mr_mip->BB;
    C = mr_mip->CC;

    slow = FALSE;
    if (B) {
        if (M % 2 != 1 || A % 2 != 1 || B % 2 != 1 || C % 2 != 1) slow = TRUE;
    } else {
        if (M % 2 != 1 || A % 2 != 1) slow = TRUE;
    }

    if (slow) {
        copy(x, y);
        for (i = 1; i < mr_mip->M; i++) modsquare2(_MIPP_ y, y);
        return;
    }

    bb = cb = cw = bw = 0;
    /* M, A (B and C) are all odd - so use fast
       Fong, Hankerson, Lopez and Menezes method */

    if (x == y) {
        copy(x, mr_mip->w0);
        wk = mr_mip->w0->w;
    } else {
        wk = x->w;
    }
    zero(y);

#if MIRACL == 8
    if (M == 271 && A == 207 && B == 175 && C == 111) {
        y->len = 34;
        for (i = 0; i < 34; i++) {
            n = i / 2;
            w = wk[i];

            we = evens[((w & 0x5) + ((w & 0x50) >> 3))];
            wo = odds[((w & 0xA) + ((w & 0xA0) >> 5))];

            i++;
            w = wk[i];

            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 4;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 4;

            y->w[n] ^= we;
            y->w[n + 17] = wo;

            y->w[n + 13] ^= wo;
            y->w[n + 11] ^= wo;
            y->w[n + 7] ^= wo;
        }
        if (y->w[33] == 0) mr_lzero(y);
        return;
    }
#endif

#if MIRACL == 32
    if (M == 1223 && A == 255) {
        y->len = 39;
        for (i = 0; i < 39; i++) {
            n = i / 2;
            w = wk[i];

            we = evens[((w & 0x5) + ((w & 0x50) >> 3))];
            wo = odds[((w & 0xA) + ((w & 0xA0) >> 5))];
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 4;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 4;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 8;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 8;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 12;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 12;

            i++;
            if (i < 39) {
                w = wk[i];

                we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 16;
                wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 16;
                w >>= 8;
                we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 20;
                wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 20;
                w >>= 8;
                we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 24;
                wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 24;
                w >>= 8;
                we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 28;
                wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 28;
            }
            y->w[n] ^= we;

            y->w[20 + n - 1] ^= wo << 4;
            y->w[20 + n] ^= wo >> 28;

            y->w[n + 4] ^= wo;
        }
        if (y->w[38] == 0) mr_lzero(y);
        return;
    }

#endif

#if MIRACL == 64
    if (M == 1223 && A == 255) {
        y->len = 20;
        for (i = 0; i < 20; i++) {
            n = i / 2;
            w = wk[i];

            we = evens[((w & 0x5) + ((w & 0x50) >> 3))];
            wo = odds[((w & 0xA) + ((w & 0xA0) >> 5))];
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 4;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 4;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 8;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 8;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 12;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 12;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 16;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 16;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 20;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 20;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 24;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 24;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 28;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 28;

            i++;
            w = wk[i];

            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 32;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 32;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 36;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 36;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 40;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 40;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 44;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 44;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 48;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 48;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 52;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 52;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 56;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 56;
            w >>= 8;
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 60;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 60;

            y->w[n] ^= we;

            y->w[10 + n - 1] ^= wo << 36;
            y->w[10 + n] ^= wo >> 28;

            y->w[n + 2] ^= wo;
        }
        if (y->w[19] == 0) mr_lzero(y);
        return;
    }

#endif

    k = 1 + (M / MIRACL);
    h = (k + 1) / 2;

    a  = (A + 1) / 2;
    aw = a / MIRACL;
    ab = a % MIRACL;

    if (B) {
        a  = (B + 1) / 2;
        bw = a / MIRACL;
        bb = a % MIRACL;

        a  = (C + 1) / 2;
        cw = a / MIRACL;
        cb = a % MIRACL;
    }
    s = h * MIRACL - 1 - (M - 1) / 2;

    y->len = k;
    for (i = 0; i < k; i++) {
        n = i / 2;
        w = wk[i];

#if MIRACL == 32

        we = evens[((w & 0x5) + ((w & 0x50) >> 3))];
        wo = odds[((w & 0xA) + ((w & 0xA0) >> 5))];
        w >>= 8;

        we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 4;
        wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 4;
        w >>= 8;

        we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 8;
        wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 8;
        w >>= 8;

        we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 12;
        wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 12;

#else
        mm = 0;
        we = wo = 0;
        for (j = 0; j < MIRACL / 8; j++) {
            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << mm;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << mm;
            mm += 4;
            w >>= 8;
        }

#endif
        i++;
        if (i < k) {
            w = wk[i];
#if MIRACL == 32

            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 16;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 16;
            w >>= 8;

            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 20;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 20;
            w >>= 8;

            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 24;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 24;
            w >>= 8;

            we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << 28;
            wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << 28;

#else
            for (j = 0; j < MIRACL / 8; j++) {
                we |= evens[((w & 0x5) + ((w & 0x50) >> 3))] << mm;
                wo |= odds[((w & 0xA) + ((w & 0xA0) >> 5))] << mm;
                mm += 4;
                w >>= 8;
            }

#endif
        }
        y->w[n] ^= we;

        if (s == 0)
            y->w[h + n] = wo;
        else {
            y->w[h + n - 1] ^= wo << (MIRACL - s);
            y->w[h + n] ^= wo >> s; /* abutt odd bits to even */
        }
        if (ab == 0)
            y->w[n + aw] ^= wo;
        else {
            y->w[n + aw] ^= wo << ab;
            y->w[n + aw + 1] ^= wo >> (MIRACL - ab);
        }
        if (B) {
            if (bb == 0)
                y->w[n + bw] ^= wo;
            else {
                y->w[n + bw] ^= wo << bb;
                y->w[n + bw + 1] ^= wo >> (MIRACL - bb);
            }
            if (cb == 0)
                y->w[n + cw] ^= wo;
            else {
                y->w[n + cw] ^= wo << cb;
                y->w[n + cw + 1] ^= wo >> (MIRACL - cb);
            }
        }
    }

    if (y->w[k - 1] == 0) mr_lzero(y);
}

#ifndef MR_STATIC

void power2(_MIPD_ big x, int m, big w)
{ /* w=x^m mod f. Could be optimised a lot, but not time critical for me */
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    copy(x, mr_mip->w1);

    convert(_MIPP_ 1, w);
    forever
    {
        if (m % 2 != 0) modmult2(_MIPP_ w, mr_mip->w1, w);
        m /= 2;
        if (m == 0) break;
        modsquare2(_MIPP_ mr_mip->w1, mr_mip->w1);
    }
}

#endif

/* Euclidean Algorithm */

BOOL inverse2(_MIPD_ big x, big w)
{
    mr_small bit;
    int      i, j, n, n3, k, n4, mb, mw;
    big      t;
    BOOL     newword;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    if (size(x) == 0) return FALSE;

    convert(_MIPP_ 1, mr_mip->w1);
    zero(mr_mip->w2);
    copy(x, mr_mip->w3);
    copy(mr_mip->modulus, mr_mip->w4);

    n3 = numbits(mr_mip->w3);
    n4 = mr_mip->M + 1;

#ifdef MR_COUNT_OPS
    fpi2++;
#endif

    while (n3 != 1) {
        j = n3 - n4;

        if (j < 0) {
            t          = mr_mip->w3;
            mr_mip->w3 = mr_mip->w4;
            mr_mip->w4 = t;
            t          = mr_mip->w1;
            mr_mip->w1 = mr_mip->w2;
            mr_mip->w2 = t;
            j          = -j;
            n          = n3;
            n3         = n4;
            n4         = n;
        }

        mw = j / MIRACL;
        mb = j % MIRACL;

        if (n3 < MIRACL) {
            mr_mip->w3->w[0] ^= mr_mip->w4->w[0] << mb;
            n3--;

            bit = ((mr_small)1 << ((n3 - 1) % MIRACL));

            while (!(mr_mip->w3->w[0] & bit)) {
                n3--;
                bit >>= 1;
            }
        } else {
            k = mr_mip->w3->len;
            if (mb == 0) {
                for (i = mw; i < k; i++)
                    mr_mip->w3->w[i] ^= mr_mip->w4->w[i - mw];
            } else {
                mr_mip->w3->w[mw] ^= mr_mip->w4->w[0] << mb;
                for (i = mw + 1; i < k; i++)
                    mr_mip->w3->w[i] ^=
                        ((mr_mip->w4->w[i - mw] << mb) |
                         (mr_mip->w4->w[i - mw - 1] >> (MIRACL - mb)));
            }

            newword = FALSE;
            while (mr_mip->w3->w[k - 1] == 0) {
                k--;
                newword = TRUE;
            }

            /*
                bit=mr_mip->w3->w[k-1];
                ASM mov eax,bit
                ASM bsr ecx,eax
                ASM mov shift,ecx
                n3=(k-1)*MIRACL+shift+1;

            */
            if (newword) {
                bit = TOPBIT;
                n3  = k * MIRACL;
            } else {
                n3--;
                bit = ((mr_small)1 << ((n3 - 1) % MIRACL));
            }
            while (!(mr_mip->w3->w[k - 1] & bit)) {
                n3--;
                bit >>= 1;
            }

            mr_mip->w3->len = k;
        }

        k = mr_mip->w2->len + mw + 1;
        if ((int)mr_mip->w1->len > k) k = mr_mip->w1->len;

        if (mb == 0) {
            for (i = mw; i < k; i++) mr_mip->w1->w[i] ^= mr_mip->w2->w[i - mw];
        } else {
            mr_mip->w1->w[mw] ^= mr_mip->w2->w[0] << mb;
            for (i = mw + 1; i < k; i++)
                mr_mip->w1->w[i] ^=
                    ((mr_mip->w2->w[i - mw] << mb) |
                     (mr_mip->w2->w[i - mw - 1] >> (MIRACL - mb)));
        }
        while (mr_mip->w1->w[k - 1] == 0) k--;
        mr_mip->w1->len = k;
    }

    copy(mr_mip->w1, w);
    return TRUE;
}

/* Schroeppel, Orman, O'Malley, Spatscheck    *
 * "Almost Inverse" algorithm, Crypto '95     *
 * More optimization here and in-lining would *
 * speed up AFFINE mode. I observe that       *
 * pentanomials would be more efficient if C  *
 * were greater                               */

/*
BOOL inverse2(_MIPD_ big x,big w)
{
    mr_small lsw,*gw;
    int i,n,bits,step,n3,n4,k;
    int k1,k2,k3,k4,ls1,ls2,ls3,ls4,rs1,rs2,rs3,rs4;
    int M,A,B,C;
    big t;
#ifdef MR_OS_THREADS
    miracl *mr_mip=get_mip();
#endif
    if (size(x)==0) return FALSE;

    M=mr_mip->M;
    A=mr_mip->AA;
    if (A==0)
    {
        mr_berror(_MIPP_ MR_ERR_NO_BASIS);
        return FALSE;
    }

    B=mr_mip->BB;
    C=mr_mip->CC;
    convert(_MIPP_ 1,mr_mip->w1);
    zero(mr_mip->w2);
    copy(x,mr_mip->w3);
    copy(mr_mip->modulus,mr_mip->w4);

    bits=zerobits(mr_mip->w3);
    shiftrightbits(mr_mip->w3,bits);
    k=bits;
    n3=numbits(mr_mip->w3);
    n4=M+1;

    if (n3>1) forever
    {
        if (n3<n4)
        {
            t=mr_mip->w3; mr_mip->w3=mr_mip->w4; mr_mip->w4=t;
            t=mr_mip->w1; mr_mip->w1=mr_mip->w2; mr_mip->w2=t;
            n=n3; n3=n4; n4=n;
        }

        add2(mr_mip->w3,mr_mip->w4,mr_mip->w3);

        add2(mr_mip->w1,mr_mip->w2,mr_mip->w1);

        if (n3==n4) n3=numbits(mr_mip->w3);
        bits=zerobits(mr_mip->w3);
        k+=bits;
        n3-=bits;
        if (n3==1) break;
        shiftrightbits(mr_mip->w3,bits);
        shiftleftbits(mr_mip->w2,bits);
   }

    copy(mr_mip->w1,w);

    if (k==0)
    {
        mr_lzero(w);
        return TRUE;
    }
    step=MIRACL;

    if (A<MIRACL) step=A;

    k1=1+M/MIRACL;
    rs1=M%MIRACL;
    ls1=MIRACL-rs1;

    k2=1+A/MIRACL;
    rs2=A%MIRACL;
    ls2=MIRACL-rs2;

    if (B)
    {
        if (C<MIRACL) step=C;

        k3=1+B/MIRACL;
        rs3=B%MIRACL;
        ls3=MIRACL-rs3;

        k4=1+C/MIRACL;
        rs4=C%MIRACL;
        ls4=MIRACL-rs4;
    }

    gw=w->w;
    while (k>0)
    {
        if (k>step) n=step;
        else        n=k;

        if (n==MIRACL) lsw=gw[0];
        else           lsw=gw[0]&(((mr_small)1<<n)-1);

        w->len=k1;
        if (rs1==0) gw[k1-1]^=lsw;
        else
        {
            w->len++;
            gw[k1]^=(lsw>>ls1);
            gw[k1-1]^=(lsw<<rs1);
        }
        if (rs2==0) gw[k2-1]^=lsw;
        else
        {
            gw[k2]^=(lsw>>ls2);
            gw[k2-1]^=(lsw<<rs2);
        }
        if (B)
        {
            if (rs3==0) gw[k3-1]^=lsw;
            else
            {
                gw[k3]^=(lsw>>ls3);
                gw[k3-1]^=(lsw<<rs3);
            }
            if (rs4==0) gw[k4-1]^=lsw;
            else
            {
                gw[k4]^=(lsw>>ls4);
                gw[k4-1]^=(lsw<<rs4);
            }
        }
        shiftrightbits(w,n);
        k-=n;
    }
    mr_lzero(w);
    return TRUE;
}

*/

BOOL multi_inverse2(_MIPD_ int m, big* x, big* w)
{ /* find w[i]=1/x[i] mod f, for i=0 to m-1 */
    int i;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    if (m == 0) return TRUE;
    if (m < 0) return FALSE;

    if (x == w) {
        mr_berror(_MIPP_ MR_ERR_BAD_PARAMETERS);
        return FALSE;
    }
    if (m == 1) {
        inverse2(_MIPP_ x[0], w[0]);
        return TRUE;
    }
    convert(_MIPP_ 1, w[0]);
    copy(x[0], w[1]);
    for (i = 2; i < m; i++) modmult2(_MIPP_ w[i - 1], x[i - 1], w[i]);

    modmult2(_MIPP_ w[m - 1], x[m - 1], mr_mip->w6);
    if (size(mr_mip->w6) == 0) {
        mr_berror(_MIPP_ MR_ERR_DIV_BY_ZERO);
        return FALSE;
    }

    inverse2(_MIPP_ mr_mip->w6, mr_mip->w6); /* y=1/y */

    copy(x[m - 1], mr_mip->w5);
    modmult2(_MIPP_ w[m - 1], mr_mip->w6, w[m - 1]);

    for (i = m - 2;; i--) {
        if (i == 0) {
            modmult2(_MIPP_ mr_mip->w5, mr_mip->w6, w[0]);
            break;
        }
        modmult2(_MIPP_ w[i], mr_mip->w5, w[i]);
        modmult2(_MIPP_ w[i], mr_mip->w6, w[i]);
        modmult2(_MIPP_ mr_mip->w5, x[i], mr_mip->w5);
    }
    return TRUE;
}

#ifndef MR_STATIC

int trace2(_MIPD_ big x)
{
    int i;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    copy(x, mr_mip->w1);
    for (i = 1; i < mr_mip->M; i++) {
        modsquare2(_MIPP_ mr_mip->w1, mr_mip->w1);
        add2(mr_mip->w1, x, mr_mip->w1);
    }
    return (int)(mr_mip->w1->w[0] & 1);
}

#endif

#ifndef MR_NO_RAND

void rand2(_MIPD_ big x)
{ /* random number */
    int i, k;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    zero(x);
    k      = 1 + mr_mip->M / MIRACL;
    x->len = k;
    for (i = 0; i < k; i++) x->w[i] = brand(_MIPPO_);
    mr_lzero(x);
    reduce2(_MIPP_ x, x);
}

#endif

int parity2(big x)
{ /* return LSB */
    if (x->len == 0) return 0;
    return (int)(x->w[0] % 2);
}

void halftrace2(_MIPD_ big b, big w)
{
    int i, M;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif

    M = mr_mip->M;
    if (M % 2 == 0) return;

    copy(b, mr_mip->w1);
    copy(b, w);

    for (i = 1; i <= (M - 1) / 2; i++) {
        modsquare2(_MIPP_ w, w);
        modsquare2(_MIPP_ w, w);
        add2(w, mr_mip->w1, w);
    }
}

BOOL quad2(_MIPD_ big b, big w)
{ /* Solves x^2 + x = b  for a root w  *
   * returns TRUE if a solution exists *
   * the "other" solution is w+1       */
    int i, M;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif

    M = mr_mip->M;
    copy(b, mr_mip->w1);
    if (M % 2 == 1)
        halftrace2(_MIPP_ b, w); /* M is odd, so its the Half-Trace */

    else {
        zero(mr_mip->w2);
        forever
        {
#ifndef MR_NO_RAND
            rand2(_MIPP_ mr_mip->w2);
#else
            incr(_MIPP_ mr_mip->w2, 1, mr_mip->w2);
#endif
            zero(w);
            copy(mr_mip->w2, mr_mip->w3);
            for (i = 1; i < M; i++) {
                modsquare2(_MIPP_ mr_mip->w3, mr_mip->w3);
                modmult2(_MIPP_ mr_mip->w3, mr_mip->w1, mr_mip->w4);
                modsquare2(_MIPP_ w, w);
                add2(w, mr_mip->w4, w);
                add2(mr_mip->w3, mr_mip->w2, mr_mip->w3);
            }
            if (size(mr_mip->w3) != 0) break;
        }
    }

    copy(w, mr_mip->w2);
    modsquare2(_MIPP_ mr_mip->w2, mr_mip->w2);
    add2(mr_mip->w2, w, mr_mip->w2);
    if (mr_compare(mr_mip->w1, mr_mip->w2) == 0) return TRUE;
    return FALSE;
}

#ifndef MR_STATIC

void gf2m_dotprod(_MIPD_ int n, big* x, big* y, big w)
{ /* dot product - only one reduction! */
    int i;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    mr_mip->check = OFF;
    zero(mr_mip->w5);

    for (i = 0; i < n; i++) {
        multiply2(_MIPP_ x[i], y[i], mr_mip->w0);
        add2(mr_mip->w5, mr_mip->w0, mr_mip->w5);
    }

    reduce2(_MIPP_ mr_mip->w5, mr_mip->w5);
    copy(mr_mip->w5, w);

    mr_mip->check = ON;
}

#endif

BOOL prepare_basis(_MIPD_ int m, int a, int b, int c, BOOL check)
{
    int i, k, sh;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    if (mr_mip->ERNUM) return FALSE;

    if (b == 0) c = 0;
    if (m == mr_mip->M && a == mr_mip->AA && b == mr_mip->BB && c == mr_mip->CC)
        return TRUE; /* its already prepared... */

    MR_IN(138)
    if (m <= 0 || a <= 0 || a >= m || b >= a) {
        mr_berror(_MIPP_ MR_ERR_BAD_MODULUS);
        MR_OUT
        return FALSE;
    }

    mr_mip->M  = m;
    mr_mip->AA = a;
    mr_mip->BB = 0;
    mr_mip->CC = 0;
    zero(mr_mip->modulus);
    convert(_MIPP_ 1, mr_mip->one);

    k = 1 + m / MIRACL;

    if (k > mr_mip->nib) {
        mr_berror(_MIPP_ MR_ERR_OVERFLOW);
        MR_OUT
        return FALSE;
    }

    mr_mip->modulus->len      = k;
    sh                        = m % MIRACL;
    mr_mip->modulus->w[k - 1] = ((mr_small)1 << sh);
    mr_mip->modulus->w[0] ^= 1;
    mr_mip->modulus->w[a / MIRACL] ^= ((mr_small)1 << (a % MIRACL));
    if (b != 0) {
        mr_mip->BB = b;
        mr_mip->CC = c;
        mr_mip->modulus->w[b / MIRACL] ^= ((mr_small)1 << (b % MIRACL));
        mr_mip->modulus->w[c / MIRACL] ^= ((mr_small)1 << (c % MIRACL));
    }

    if (!check) {
        MR_OUT
        return TRUE;
    }

    /* check for irreducibility of basis */

    zero(mr_mip->w4);
    mr_mip->w4->len  = 1;
    mr_mip->w4->w[0] = 2; /* f(t) = t */
    for (i = 1; i <= m / 2; i++) {
        modsquare2(_MIPP_ mr_mip->w4, mr_mip->w4);
        incr2(mr_mip->w4, 2, mr_mip->w5);
        gcd2(_MIPP_ mr_mip->w5, mr_mip->modulus, mr_mip->w6);
        if (size(mr_mip->w6) != 1) {
            mr_berror(_MIPP_ MR_ERR_NOT_IRREDUC);
            MR_OUT
            return FALSE;
        }
    }

    MR_OUT
    return TRUE;
}

#endif
